import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
dataset = pd.read_csv('train.csv', index_col=0, header=[0, 1])
dataset.rename(
columns={
'Unnamed: 209_level_1': 'count',
'Unnamed: 210_level_1': 'price',
},
level = 1,
inplace = True
)
dataset.head()
| timestamp_second | bid_price_0 | bid_quantity_0 | ... | sell_trade_quantity | count | price | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| nunique | mean | median | std | min | max | first | last | sum | nunique | ... | nunique | mean | median | std | min | max | first | last | count | price | |
| 0 | 7 | 44.183379 | 44.1865 | 0.007340 | 44.171 | 44.190 | 44.171 | 44.190 | 25880.7 | 5 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 66 | 44.2005 |
| 1 | 1 | 44.190000 | 44.1900 | 0.000000 | 44.190 | 44.190 | 44.190 | 44.190 | 76.7 | 1 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 13 | 44.2195 |
| 2 | 3 | 44.187467 | 44.1860 | 0.001767 | 44.186 | 44.190 | 44.190 | 44.186 | 597.5 | 3 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 15 | 44.2195 |
| 3 | 3 | 44.181038 | 44.1860 | 0.006993 | 44.173 | 44.188 | 44.186 | 44.188 | 2062.8 | 2 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 52 | 44.2175 |
| 4 | 2 | 44.213095 | 44.2190 | 0.012474 | 44.188 | 44.219 | 44.188 | 44.219 | 6350.8 | 2 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 21 | 44.2060 |
5 rows × 210 columns
dataset_test = pd.read_csv('test_public.csv', index_col=0, header=[0, 1])
dataset_test.rename(
columns={
'Unnamed: 209_level_1': 'count',
'Unnamed: 210_level_1': 'price',
},
level = 1,
inplace = True
)
dataset_test.head()
| timestamp_second | bid_price_0 | bid_quantity_0 | ... | sell_trade_quantity | count | price | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| nunique | mean | median | std | min | max | first | last | sum | nunique | ... | nunique | mean | median | std | min | max | first | last | count | price | |
| 1300000 | 8 | 30.814385 | 30.814 | 0.002522 | 30.810 | 30.819 | 30.810 | 30.819 | 6053.2 | 6 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 52 | 30.8295 |
| 1300001 | 4 | 30.821468 | 30.820 | 0.002858 | 30.819 | 30.826 | 30.819 | 30.826 | 7911.8 | 9 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 47 | 30.8380 |
| 1300002 | 8 | 30.831167 | 30.833 | 0.002328 | 30.826 | 30.833 | 30.826 | 30.833 | 9012.1 | 8 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 48 | 30.8500 |
| 1300003 | 12 | 30.848835 | 30.843 | 0.015865 | 30.833 | 30.872 | 30.833 | 30.844 | 13234.6 | 12 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 91 | 30.8605 |
| 1300004 | 7 | 30.851372 | 30.860 | 0.013284 | 30.810 | 30.861 | 30.844 | 30.835 | 7546.1 | 7 | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 78 | 30.8720 |
5 rows × 210 columns
pip install catboost
Requirement already satisfied: catboost in c:\users\amega\anaconda\lib\site-packages (1.1.1)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: six in c:\users\amega\anaconda\lib\site-packages (from catboost) (1.16.0) Requirement already satisfied: scipy in c:\users\amega\anaconda\lib\site-packages (from catboost) (1.7.3) Requirement already satisfied: graphviz in c:\users\amega\anaconda\lib\site-packages (from catboost) (0.20.1) Requirement already satisfied: plotly in c:\users\amega\anaconda\lib\site-packages (from catboost) (5.9.0) Requirement already satisfied: pandas>=0.24.0 in c:\users\amega\anaconda\lib\site-packages (from catboost) (1.4.3) Requirement already satisfied: matplotlib in c:\users\amega\anaconda\lib\site-packages (from catboost) (3.5.2) Requirement already satisfied: numpy>=1.16.0 in c:\users\amega\anaconda\lib\site-packages (from catboost) (1.21.5) Requirement already satisfied: pytz>=2020.1 in c:\users\amega\anaconda\lib\site-packages (from pandas>=0.24.0->catboost) (2022.1) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\amega\anaconda\lib\site-packages (from pandas>=0.24.0->catboost) (2.8.2) Requirement already satisfied: pyparsing>=2.2.1 in c:\users\amega\anaconda\lib\site-packages (from matplotlib->catboost) (3.0.9) Requirement already satisfied: packaging>=20.0 in c:\users\amega\anaconda\lib\site-packages (from matplotlib->catboost) (21.3) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\amega\anaconda\lib\site-packages (from matplotlib->catboost) (1.4.2) Requirement already satisfied: cycler>=0.10 in c:\users\amega\anaconda\lib\site-packages (from matplotlib->catboost) (0.11.0) Requirement already satisfied: pillow>=6.2.0 in c:\users\amega\anaconda\lib\site-packages (from matplotlib->catboost) (9.2.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\amega\anaconda\lib\site-packages (from matplotlib->catboost) (4.25.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\amega\anaconda\lib\site-packages (from plotly->catboost) (8.0.1)
from catboost import CatBoostRegressor
cat = CatBoostRegressor()
need_col_vol = [col for col in dataset.columns.tolist() if col[1] in ['sum','nunique','mean','std'] and col[0].find('price') == -1]
need_col_list = [
('buy_trade_price', 'count'),
('buy_trade_price', 'nunique'),
('sell_trade_price', 'count'),
('sell_trade_price', 'nunique'),
('count', 'count'),
('buy_trade_price_weight', ''),
('sell_trade_price_weight', ''),
('dif_trade_price_weight', ''),
('VolImbalance', ''),
('BAspread', ''),
('bid_price_3_minmax_dif', ''),
('bid_price_1_minmax_dif', ''),
('bid_price_0_minmax_dif', ''),
('bid_price_2_minmax_dif', ''),
('bid_price_4_minmax_dif', ''),
('ask_price_4_minmax_dif', ''),
('ask_price_1_minmax_dif', ''),
('ask_price_0_minmax_dif', ''),
('ask_price_3_minmax_dif', ''),
('ask_price_2_minmax_dif', ''),
('spread_1', ''),
('spread_2', ''),
('spread_3', ''),
('spread_4', ''),
('sum_dif_spread', ''),
('Vspread_0', ''),
('Vspread_1', ''),
('Vspread_2', ''),
('Vspread_3', ''),
('Vspread_4', ''),
('sum_dif_vspread_5', ''),
('sum_dif_vspread_1', ''),
('sum_dif_vspread_2', ''),
('sum_dif_vspread_3', ''),
('sum_dif_vspread_4', ''),
('bid_dif', ''),
('ask_dif', ''),
('bid_dif_0', ''),
('ask_dif_0', ''),
('bid_dif_1', ''),
('ask_dif_1', ''),
('bid_dif_2', ''),
('ask_dif_2', ''),
('bid_dif_3', ''),
('ask_dif_3', ''),
('ammount_dif0', ''),
('ammount_dif1', ''),
('ammount_dif2', ''),
('ammount_dif3', ''),
('ammount_dif4', ''),
('ammount_dif_5', ''),
('range_bid', ''),
('range_ask', ''),
('baratio_0', ''),
('paratio_0', ''),
('bratio', ''),
('buy_vol_count_ratio_0', ''),
('sell_vol_count_ratio_0', ''),
('baratio_1', ''),
('paratio_1', ''),
('buy_vol_count_ratio_1', ''),
('sell_vol_count_ratio_1', ''),
('baratio_2', ''),
('paratio_2', ''),
('buy_vol_count_ratio_2', ''),
('sell_vol_count_ratio_2', ''),
('baratio_3', ''),
('paratio_3', ''),
('buy_vol_count_ratio_3', ''),
('sell_vol_count_ratio_3', ''),
('baratio_4', ''),
('paratio_4', ''),
('buy_vol_count_ratio_4', ''),
('sell_vol_count_ratio_4', '')]
need_col_list_new = list(set(need_col_list + need_col_vol + [('buy_trade_price', 'std'),('sell_trade_price', 'std'),('buy_trade_dif',''),('sell_trade_dif',''),\
('buy_price_count_ratio',''),('buy_price_count_ratio','')]))
def new_batch_v5(X):
y = X[('price','price')].copy()
df = X.copy()
col_na = ['count', 'nunique', 'mean', 'median', 'std', 'min', 'max']
for el in col_na:
df.loc[df[('buy_trade_quantity','count')] == 0,('buy_trade_quantity',el)] = 0
df.loc[df[('sell_trade_quantity','count')] == 0,('sell_trade_quantity',el)] = 0
df.loc[df[('sell_trade_quantity','count')] == 1,('sell_trade_quantity','std')] = 0
df.loc[df[('sell_trade_price','count')] == 1,('sell_trade_price','std')] = 0
df.loc[df[('buy_trade_quantity','count')] == 1,('buy_trade_quantity','std')] = 0
df.loc[df[('buy_trade_price','count')] == 1,('buy_trade_price','std')] = 0
df.loc[df[('sell_trade_quantity','count')] == 1,('sell_trade_quantity','std')] = 0
df.loc[df[('sell_trade_price','count')] == 1,('sell_trade_price','std')] = 0
df.loc[df[('buy_trade_quantity','count')] == 1,('buy_trade_quantity','std')] = 0
df.loc[df[('buy_trade_price','count')] == 1,('buy_trade_price','std')] = 0
df['buy_trade_price_weight'] = df[('buy_trade_price','mean')] * df[('buy_trade_quantity','sum')]
df['sell_trade_price_weight'] = df[('sell_trade_price','mean')] * df[('sell_trade_quantity','sum')]
df['dif_trade_price_weight'] = df['buy_trade_price_weight'] - df['sell_trade_price_weight']
df['VolImbalance'] = df[('bid_quantity_0', 'sum')] / (df[('ask_quantity_0', 'sum')] + df[('bid_quantity_0', 'sum')])
df['BAspread'] = df[('ask_price_0', 'mean')] - df[('bid_price_0', 'mean')]
bid_price_list = ['bid_price_0', 'bid_price_1', 'bid_price_2', 'bid_price_3', 'bid_price_4']
ask_price_list = ['ask_price_0', 'ask_price_1', 'ask_price_2', 'ask_price_3', 'ask_price_4']
bid_quantity_list = ['bid_quantity_0','bid_quantity_1','bid_quantity_2','bid_quantity_3','bid_quantity_4']
ask_quantity_list = ['ask_quantity_0','ask_quantity_1','ask_quantity_2','ask_quantity_3','ask_quantity_4']
bid_list = ['bid_price_3', 'bid_price_1', 'bid_price_0', 'bid_price_2', 'bid_price_4']
ask_list = ['ask_price_4', 'ask_price_1', 'ask_price_0', 'ask_price_3', 'ask_price_2']
for el in bid_list:
df[el + '_minmax_dif'] = df[(el,'max')] - df[(el,'min')]
for el in ask_list:
df[el + '_minmax_dif'] = df[(el,'max')] - df[(el,'min')]
for i in range(1,5):
df['spread_' + str(i)] = df[('ask_price_' + str(i),'mean')] - df[('bid_price_' + str(i),'mean')]
df['sum_dif_spread'] = df.iloc[:,-5:].sum(axis=1)
for i in range(5):
df['Vspread_' + str(i)] = df[('ask_quantity_' + str(i),'sum')] - df[('bid_quantity_' + str(i),'sum')]
df['sum_dif_vspread_5'] = df.iloc[:,-5:].sum(axis=1)
dict_volume_cum_spread = {}
for i in range(1,5):
dict_volume_cum_spread['sum_dif_vspread_' + str(i)] = df.iloc[:,-6:-5+i].sum(axis=1)
for i in range(1,5):
df['sum_dif_vspread_' + str(i)] = dict_volume_cum_spread['sum_dif_vspread_' + str(i)]
df['bid_dif'] = df[('bid_price_0','mean')] - df[('bid_price_4','mean')]
df['ask_dif'] = df[('ask_price_4','mean')] - df[('ask_price_0','mean')]
for i in range(4):
df['bid_dif_' + str(i)] = abs(df[('bid_price_' + str(i),'mean')] - df[('bid_price_' + str(i + 1),'mean')])
df['ask_dif_' + str(i)] = abs(df[('ask_price_' + str(i+1),'mean')] - df[('ask_price_' + str(i),'mean')])
for i in range(len(bid_price_list)):
df[bid_price_list[i] + 'ammount'] = df[(bid_price_list[i],'mean')] * df[(bid_quantity_list[i],'sum')]
df[ask_price_list[i] + 'ammount'] = df[(ask_price_list[i],'mean')] * df[(ask_quantity_list[i],'sum')]
for i in range(len(bid_price_list)):
df['ammount_dif' + str(i)] = df[bid_price_list[i] + 'ammount'] - df[ask_price_list[i] + 'ammount']
df['ammount_dif_5'] = df.iloc[:,-5:].sum(axis=1)
dict_ammount_dif_cum_spread = {}
for i in range(1,5):
dict_ammount_dif_cum_spread['ammount_dif' + str(i)] = df.iloc[:,-6:-5+i].sum(axis=1)
for i in range(1,5):
df['ammount_dif' + str(i)] = dict_ammount_dif_cum_spread['ammount_dif' + str(i)]
df['range_bid'] = df[('bid_price_0','max')] - df[('bid_price_4','min')]
df['range_ask'] = df[('ask_price_4','max')] - df[('ask_price_0','min')]
df['buy_trade_dif'] = df[('buy_trade_price','max')] - df[('buy_trade_price','min')]
df['sell_trade_dif'] = df[('sell_trade_price','max')] - df[('sell_trade_price','min')]
for i in range(5):
df['baratio_' + str(i)] = df[('bid_quantity_' + str(i),'sum')] / df[('ask_quantity_' + str(i),'sum')]
df['paratio_' + str(i)] = df[('bid_price_' + str(i),'mean')] / df[('ask_price_' + str(i),'mean')]
df['bratio'] = df[[('bid_quantity_' + str(i),'sum') for i in range(5)]].sum(axis=1) / df[[('ask_quantity_' + str(i),'sum') for i in range(5)]].sum(axis=1)
df['buy_vol_count_ratio_' + str(i)] = df[('buy_trade_quantity','sum')] / df[('count','count')]
df['sell_vol_count_ratio_' + str(i)] = df[('sell_trade_quantity','sum')] / df[('count','count')]
df['buy_price_count_ratio'] = df[('buy_trade_price','count')] / df[('count','count')]
df['sell_price_count_ratio'] = df[('sell_trade_price','count')] / df[('count','count')]
df['ema_200'] = y.ewm(span=200).mean()
df['ema_200_ratio'] = df['ema_200'] / y
df['ema_80'] = y.ewm(span=80).mean()
df['ema_80_ratio'] = df['ema_80'] / y
df['delta_1'] = df['ema_200'] - y
df['delta_2'] = df['ema_80'] - y
df['delta_3'] = df['ema_200'] - df['ema_80']
df['delta_1_ratio'] = df['delta_1'] / y
df['delta_2_ratio'] = df['delta_2'] / y
df['delta_3_ratio'] = df['delta_3'] / df['ema_80']
df['pct_1'] = y.pct_change(1)
df['mean_60_pct_1'] = df['pct_1'].rolling(window=60).mean()
df['mean_10_pct_1'] = df['pct_1'].rolling(window=10).mean()
df['pct_60'] = y.pct_change(60)
df['dif_60'] = y - y.shift(60)
df['dif_1'] = y - y.shift(1)
df['price'] = y
df['shift_price'] = y.shift(-60)
df['dif_price'] = (y.shift(-60) - y)
df['log_dif_price'] = (np.log(y.shift(-60) / y))
df['bid_price_0_shift'] = df[('bid_price_0','max')].shift(-60)
df['ask_price_0_shift'] = df[('ask_price_0','min')].shift(-60)
df['bid_price_0_shift_v2'] = df[('bid_price_0','first')].shift(-60)
df['ask_price_0_shift_v2'] = df[('ask_price_0','first')].shift(-60)
df['price_shift_max'] = y.rolling(window=60).max().shift(-60)
df['price_shift_min'] = y.rolling(window=60).min().shift(-60)
new_columns = ['ema_200','ema_200_ratio','ema_80','ema_80_ratio','delta_1','delta_2','delta_3','delta_1_ratio',\
'delta_2_ratio','delta_3_ratio','mean_60_pct_1','mean_10_pct_1','pct_60','pct_1',\
'dif_60','dif_1','shift_price','dif_price','log_dif_price']
return df[need_col_list_new + [(el,'') for el in new_columns] + [('price','price')] + \
[('bid_price_0','first'),('bid_price_0_shift_v2',''),('ask_price_0','first'),('ask_price_0_shift_v2',''),\
('price_shift_max',''),('price_shift_min','')]]
dataset_new5 = new_batch_v5(dataset)
dataset_test_new5 = new_batch_v5(dataset_test)
def get_pnl_v4(dataset_old,dataset_oot,train_ind,test_ind,params,T_list,type_='dif'):
X_train = dataset_old.iloc[:,:127][dataset_old.index.isin(train_ind)]
X_test = dataset_old.iloc[:,:127][dataset_old.index.isin(test_ind)]
X_oot = dataset_oot.iloc[:,:127]
y_old = dataset_old[('price','price')]
y_old_60 = dataset_old[('shift_price','')]
y_oot = dataset_oot[('price','price')]
y_train = y_old[y_old.index.isin(train_ind)]
y_test = y_old[y_old.index.isin(test_ind)]
y_train_60 = y_old_60[y_old_60.index.isin(train_ind)]
y_test_60 = y_old_60[y_old_60.index.isin(test_ind)]
y_oot_60 = dataset_oot[('shift_price','')]
y_dif = dataset_old[('dif_price','')]
y_dif_train = y_dif[dataset_old.index.isin(train_ind)]
y_dif_test = y_dif[dataset_old.index.isin(test_ind)]
y_dif_oot = dataset_oot[('dif_price','')]
y_log_dif = dataset_old[('log_dif_price','')]
y_log_dif_train = y_log_dif[dataset_old.index.isin(train_ind)]
y_log_dif_test = y_log_dif[dataset_old.index.isin(test_ind)]
y_log_dif_oot = dataset_oot[('log_dif_price','')]
hitrate_train_1_list = []
hitrate_test_1_list = []
hitrate_oot_1_list = []
param_dict = {'early_stopping_rounds' : 30,
'thread_count' : -1,
'loss_function' : 'RMSE'
}
param_dict = dict(list(param_dict.items()) + params)
estimator = CatBoostRegressor(**param_dict)
if type_ == 'dif':
# dif_ratio
estimator.fit(X_train,y_dif_train,verbose=False)
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
y_oot_pred = estimator.predict(X_oot)
y_train_pred_1 = y_train + y_train_pred
y_test_pred_1 = y_test + y_test_pred
y_oot_pred_1 = y_oot + y_oot_pred
df_train = pd.DataFrame({'pred':y_train_pred_1 - y_train,\
'buy_true' :-dataset_old[dataset_old.index.isin(train_ind)][('ask_price_0','first')] + \
dataset_old[dataset_old.index.isin(train_ind)][('bid_price_0_shift_v2','')],\
'sell_true' : dataset_old[dataset_old.index.isin(train_ind)][('bid_price_0','first')] - \
dataset_old[dataset_old.index.isin(train_ind)][('ask_price_0_shift_v2','')]})
df_test = pd.DataFrame({'pred':y_test_pred_1 - y_test,\
'buy_true' :-dataset_old[dataset_old.index.isin(test_ind)][('ask_price_0','first')] + \
dataset_old[dataset_old.index.isin(test_ind)][('bid_price_0_shift_v2','')],\
'sell_true' : dataset_old[dataset_old.index.isin(test_ind)][('bid_price_0','first')] - \
dataset_old[dataset_old.index.isin(test_ind)][('ask_price_0_shift_v2','')]})
df_oot = pd.DataFrame({'pred':y_oot_pred_1 - y_oot,\
'buy_true' :-dataset_oot[('ask_price_0','first')] + \
dataset_oot[('bid_price_0_shift_v2','')],\
'sell_true' : dataset_oot[('bid_price_0','first')] - \
dataset_oot[('ask_price_0_shift_v2','')]})
else :
estimator.fit(X_train,y_log_dif_train,verbose=False)
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
y_oot_pred = estimator.predict(X_oot)
y_train_pred_1 = y_train * np.exp(y_train_pred)
y_test_pred_1 = y_test * np.exp(y_test_pred)
y_oot_pred_1 = y_oot * np.exp(y_oot_pred)
df_train = pd.DataFrame({'pred':y_train_pred_1 - y_train,\
'buy_true' :-dataset_old[dataset_old.index.isin(train_ind)][('ask_price_0','first')] + \
dataset_old[dataset_old.index.isin(train_ind)][('bid_price_0_shift_v2','')],\
'sell_true' : dataset_old[dataset_old.index.isin(train_ind)][('bid_price_0','first')] - \
dataset_old[dataset_old.index.isin(train_ind)][('ask_price_0_shift_v2','')]})
df_test = pd.DataFrame({'pred':y_test_pred_1 - y_test,\
'buy_true' :-dataset_old[dataset_old.index.isin(test_ind)][('ask_price_0','first')] + \
dataset_old[dataset_old.index.isin(test_ind)][('bid_price_0_shift_v2','')],\
'sell_true' : dataset_old[dataset_old.index.isin(test_ind)][('bid_price_0','first')] - \
dataset_old[dataset_old.index.isin(test_ind)][('ask_price_0_shift_v2','')]})
df_oot = pd.DataFrame({'pred':y_oot_pred_1 - y_oot,\
'buy_true' :-dataset_oot[('ask_price_0','first')] + \
dataset_oot[('bid_price_0_shift_v2','')],\
'sell_true' : dataset_oot[('bid_price_0','first')] - \
dataset_oot[('ask_price_0_shift_v2','')]})
train_list = []
test_list = []
oot_list = []
for T in T_list:
df_train_T_stats = get_df_total_stats_v2(df_train,T)
df_test_T_stats = get_df_total_stats_v2(df_test,T)
df_oot_T_stats = get_df_total_stats_v2(df_oot,T)
train_list.append(df_train_T_stats)
test_list.append(df_test_T_stats)
oot_list.append(df_oot_T_stats)
return train_list,test_list,oot_list
def get_params_list(grid,ind_list):
param_list = [[] for i in range(len(ind_list))]
df_temp = grid.iloc[ind_list].copy()
for i in range(df_temp.shape[0]):
param_list[i].append(('iterations',df_temp.iloc[i,-7]))
param_list[i].append(('depth',df_temp.iloc[i,-6]))
param_list[i].append(('learning_rate',df_temp.iloc[i,-5]))
param_list[i].append(('l2_leaf_reg',df_temp.iloc[i,-4]))
param_list[i].append(('rsm',df_temp.iloc[i,-3]))
param_list[i].append(('subsample',1))
param_list[i].append(('min_data_in_leaf', 2000))
return param_list
param_list_1_2 = get_params_list(grid_upd_1,top_ind_1_2)
param_list_2_1 = get_params_list(grid_upd_2,top_ind_2_1)
param_list_2_2 = get_params_list(grid_upd_2,top_ind_2_2)
def get_stats_t_v4(dataset_old,dataset_oot,train_ind,test_ind,params,type_='dif'):
X_train = dataset_old.iloc[:,:127][dataset_old.index.isin(train_ind)]
X_test = dataset_old.iloc[:,:127][dataset_old.index.isin(test_ind)]
X_oot = dataset_oot.iloc[:,:127]
y_old = dataset_old[('price','price')]
y_old_60 = dataset_old[('shift_price','')]
y_oot = dataset_oot[('price','price')]
y_train = y_old[y_old.index.isin(train_ind)]
y_test = y_old[y_old.index.isin(test_ind)]
y_train_60 = y_old_60[y_old_60.index.isin(train_ind)]
y_test_60 = y_old_60[y_old_60.index.isin(test_ind)]
y_oot_60 = dataset_oot[('shift_price','')]
y_dif = dataset_old[('dif_price','')]
y_dif_train = y_dif[dataset_old.index.isin(train_ind)]
y_dif_test = y_dif[dataset_old.index.isin(test_ind)]
y_dif_oot = dataset_oot[('dif_price','')]
y_log_dif = dataset_old[('log_dif_price','')]
y_log_dif_train = y_log_dif[dataset_old.index.isin(train_ind)]
y_log_dif_test = y_log_dif[dataset_old.index.isin(test_ind)]
y_log_dif_oot = dataset_oot[('log_dif_price','')]
param_dict = {'early_stopping_rounds' : 30,
'thread_count' : -1,
'loss_function' : 'RMSE'
}
param_dict = dict(list(param_dict.items()) + params)
estimator = CatBoostRegressor(random_state=42,**param_dict)
if type_ == 'dif':
# dif_ratio
estimator.fit(X_train,y_dif_train,verbose=False)
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
y_oot_pred = estimator.predict(X_oot)
y_train_pred_1 = y_train + y_train_pred
y_test_pred_1 = y_test + y_test_pred
y_oot_pred_1 = y_oot + y_oot_pred
else :
estimator.fit(X_train,y_log_dif_train,verbose=False)
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
y_oot_pred = estimator.predict(X_oot)
y_train_pred_1 = y_train * np.exp(y_train_pred)
y_test_pred_1 = y_test * np.exp(y_test_pred)
y_oot_pred_1 = y_oot * np.exp(y_oot_pred)
print(mse(y_train_60,y_train_pred_1),mse(y_test_60,y_test_pred_1),mse(y_oot_60,y_oot_pred_1))
df_train = pd.DataFrame({'pred':y_train_pred_1 - y_train,'bid_price' : dataset_old[dataset_old.index.isin(train_ind)][('bid_price_0','first')],\
'ask_price' : dataset_old[dataset_old.index.isin(train_ind)][('ask_price_0','first')], 'true' : dataset_old[dataset_old.index.isin(train_ind)][('dif_price','')],\
'buy_true' :-dataset_old[dataset_old.index.isin(train_ind)][('ask_price_0','first')] + \
dataset_old[dataset_old.index.isin(train_ind)][('bid_price_0_shift_v2','')],\
'sell_true' : dataset_old[dataset_old.index.isin(train_ind)][('bid_price_0','first')] - \
dataset_old[dataset_old.index.isin(train_ind)][('ask_price_0_shift_v2','')]})
df_test = pd.DataFrame({'pred':y_test_pred_1 - y_test,'bid_price' : dataset_old[dataset_old.index.isin(test_ind)][('bid_price_0','first')],\
'ask_price' : dataset_old[dataset_old.index.isin(test_ind)][('ask_price_0','first')], 'true' : dataset_old[dataset_old.index.isin(test_ind)][('dif_price','')],\
'buy_true' :-dataset_old[dataset_old.index.isin(test_ind)][('ask_price_0','first')] + \
dataset_old[dataset_old.index.isin(test_ind)][('bid_price_0_shift_v2','')],\
'sell_true' : dataset_old[dataset_old.index.isin(test_ind)][('bid_price_0','first')] - \
dataset_old[dataset_old.index.isin(test_ind)][('ask_price_0_shift_v2','')]})
df_oot = pd.DataFrame({'pred':y_oot_pred_1 - y_oot,'bid_price' : dataset_oot[('bid_price_0','first')],\
'ask_price' : dataset_oot[('ask_price_0','first')], 'true' : dataset_oot[('dif_price','')],\
'buy_true' :-dataset_oot[('ask_price_0','first')] + \
dataset_oot[('bid_price_0_shift_v2','')],\
'sell_true' : dataset_oot[('bid_price_0','first')] - \
dataset_oot[('ask_price_0_shift_v2','')]})
return df_train,df_test,df_oot
get_mse_boosting_df_v3(grid_upd_2_1)
| train_1 | test_1 | htrain_1 | htest_1 | train_2 | test_2 | htrain_2 | htest_2 | iter | depth | lr | l2 | rsm | dif_1 | dif_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.005621 | 0.006112 | 0.5359 | 0.5325 | 0.005622 | 0.006113 | 0.5356 | 0.5332 | 100 | 3 | 0.10 | 5 | 0.8 | -0.000490 | -0.000491 |
| 1 | 0.005621 | 0.006109 | 0.5357 | 0.5336 | 0.005620 | 0.006111 | 0.5358 | 0.5335 | 100 | 4 | 0.07 | 5 | 1.0 | -0.000489 | -0.000491 |
| 2 | 0.005621 | 0.006111 | 0.5357 | 0.5330 | 0.005621 | 0.006111 | 0.5358 | 0.5325 | 100 | 3 | 0.10 | 3 | 1.0 | -0.000490 | -0.000490 |
| 3 | 0.005620 | 0.006110 | 0.5359 | 0.5334 | 0.005620 | 0.006110 | 0.5358 | 0.5341 | 100 | 4 | 0.07 | 5 | 0.8 | -0.000490 | -0.000490 |
| 4 | 0.005627 | 0.006112 | 0.5348 | 0.5332 | 0.005627 | 0.006112 | 0.5351 | 0.5329 | 100 | 3 | 0.07 | 5 | 0.8 | -0.000484 | -0.000484 |
df_tr,df_tt,df_ot = get_stats_t_v4(dataset_new5,dataset_test_new5.iloc[200:-200],\
np.array([range(200 + 7500 * k,7500 * k + 6200) for k in range(173)]).flatten(),\
np.array([range(6200 + 7500 * k,7500 * k + 7700) for k in range(173)]).flatten(),param_list_2_1[1],type_='dif')
0.005620676749994442 0.00610948297817269 0.010233244648599264
def step_1_v2(df_bid_ask,y_pred,T=0.1,S=1,M=1000,C=0):
df = pd.DataFrame({'pred':y_pred})
df[['ask_price','bid_price']] = df_bid_ask
fee_list = []
timestamp_list = []
trade_price_list = []
side_list = []
quantity_list = []
M_value = 0
df_current = df[abs(df.pred) > T].copy()
ind_list_ = df_current.index
for i in ind_list_:
if df_current.loc[i,'pred'] > T and M_value < M:
try :
cur_price = df_current.loc[i,'ask_price']
next_price = df.loc[i+60,'bid_price']
timestamp_list.append(i)
trade_price_list.append(cur_price)
side_list.append('buy')
quantity_list.append(S)
fee_list.append(trade_price_list[-1] * quantity_list[-1] * C)
M_value +=1
timestamp_list.append(i+60)
trade_price_list.append(next_price)
side_list.append('sell')
quantity_list.append(S)
fee_list.append(trade_price_list[-1] * quantity_list[-1] * C)
except Exception:
pass
if df_current.loc[i,'pred'] < -T and M_value > -M:
try :
cur_price = df_current.loc[i,'bid_price']
next_price = df.loc[i+60,'ask_price']
timestamp_list.append(i)
trade_price_list.append(cur_price)
side_list.append('sell')
quantity_list.append(S)
fee_list.append(trade_price_list[-1] * quantity_list[-1] * C)
M_value -=1
timestamp_list.append(i+60)
trade_price_list.append(next_price)
side_list.append('buy')
quantity_list.append(S)
fee_list.append(trade_price_list[-1] * quantity_list[-1] * C)
except Exception:
pass
df_new = pd.DataFrame({'timestamp' : timestamp_list, 'trade_price' : trade_price_list, 'trade_quantity' : quantity_list,\
'side' : side_list,'fee' : fee_list})
return df_new
def get_maxdown(list_):
cur_sum = []
sum_ = 0
for el in list_:
sum_ += el
cur_sum.append(sum_)
if sum_ < 0:
sum_ = 0
return max(cur_sum)
def step_2_v4(df_bid_ask,y_pred,T=0.1,S=1,M=1000,C=0):
df_pnl = step_1_v2(df_bid_ask,y_pred,T,S,M,C)
if df_pnl.shape[0] == 0:
return 0
df_pnl['sign_side'] = df_pnl.side.apply(lambda x : -1 if x == 'sell' else 1)
df_pnl['current_num'] = df_pnl.apply(lambda x : x.trade_quantity * x.sign_side ,axis=1).cumsum()
df_pnl['pnl'] = ((df_pnl['trade_price'] - df_pnl['trade_price'].shift(1))* df_pnl.current_num.shift(1)).fillna(0)
df_pnl['pnl_fee'] = df_pnl['pnl'] - df_pnl['fee']
df_pnl['pnl_fee_v2'] = df_pnl['pnl_fee'].rolling(2).sum()
df_pnl['pnl_2'] = df_pnl['pnl'].cumsum()
pnl_list = df_pnl.loc[1::2,'pnl_fee_v2']
total_pnl = pnl_list.sum()
mean_pnl = pnl_list.mean()
std_pnl = pnl_list.std()
sharp_ratio = mean_pnl / std_pnl
volume = (df_pnl.trade_price * df_pnl.trade_quantity).sum()
trade_num = pnl_list.shape[0]
positive_trade_ratio = (pnl_list > 0).sum() / trade_num
max_drop_down = get_maxdown(-pnl_list)
return dict(zip('total_pnl,mean_pnl,std_pnl,sharp_ratio,volume,trade_num,positive_trade_ratio,max_drop_down'.split(','),np.round([total_pnl,mean_pnl,std_pnl,sharp_ratio,volume,trade_num,positive_trade_ratio,max_drop_down],5)))
%%time
T_list = np.arange(0.01,0.11,0.001)
S = 1
C = 0
train_stats_list = []
test_stats_list = []
M_list = np.logspace(1,4,10)
for T_cur in T_list:
T = T_cur
for M_cur in M_list:
M = M_cur
try :
test_stats = step_2_v4(df_tt[['ask_price','bid_price']],df_tt.pred,T,S,M,C)
train_stats = step_2_v4(df_tr[['ask_price','bid_price']],df_tr.pred,T,S,M,C)
train_stats_list.append([train_stats,T,M])
test_stats_list.append([test_stats,T,M])
except Exception:
pass
CPU times: total: 19min 20s Wall time: 19min 30s
def plot_stats(train_stats_list,test_stats_list,func_list,M=10000,C=0):
train_stats_T = [train_stats_list[i] for i,el in enumerate(train_stats_list) if train_stats_list[i][-1] == M]
test_stats_T = [test_stats_list[i] for i,el in enumerate(test_stats_list) if test_stats_list[i][-1] == M]
stats_list = [train_stats_T,test_stats_T]
n_size = len(stats_list) * len(func_list)
fig, ax = plt.subplots(n_size, 1, figsize = [10, 5 * n_size])
i = 0
for l,el in enumerate(stats_list):
for j,func in enumerate(func_list):
if l == 0:
title_name = 'train '
else:
title_name = 'test '
title_name = title_name + func
max_ = max([el[i][0][func] for i in range(len(el))])
min_ = min([el[i][0][func] for i in range(len(el))])
ax[i].set_title(title_name,fontsize = 20)
ax[i].plot([el[k][-2] for k in range(len(el))],\
[el[k][0][func] for k in range(len(el))],color='black', marker='.',label = f'{func},M={int(M)},C={round(C,5)}')
ax[i].axhline(y = max_, color ="red", linestyle ="--",label = f'max {func} = {max_}')
ax[i].legend()
ax[i].set_xlabel('T')
ax[i].set_ylabel(func)
if func in ['total_pnl','volume','trade_num','max_drop_down']:
ax[i].set_yscale('log')
ax[i].grid(True)
ax[i].set_xticks(np.arange(0.01,0.111,0.01))
i+=1
plt.tight_layout()
plt.show()
def step_3(df_bid_ask,y_pred,T=0.1,S=1,M=1000,C=0):
df_pnl = step_1_v2(df_bid_ask,y_pred,T,S,M,C)
if df_pnl.shape[0] == 0:
return 0
df_pnl['sign_side'] = df_pnl.side.apply(lambda x : -1 if x == 'sell' else 1)
df_pnl['current_num'] = df_pnl.apply(lambda x : x.trade_quantity * x.sign_side ,axis=1).cumsum()
df_pnl['pnl'] = ((df_pnl['trade_price'] - df_pnl['trade_price'].shift(1))* df_pnl.current_num.shift(1)).fillna(0)
df_pnl['pnl_fee'] = df_pnl['pnl'] - df_pnl['fee']
df_pnl['pnl_fee_v2'] = df_pnl['pnl_fee'].rolling(2).sum()
df_pnl['pnl_v2'] = df_pnl['pnl'].rolling(2).sum()
pnl_list = df_pnl.loc[1::2,'pnl_fee_v2']
pnl_fee_cumsum = df_pnl.loc[1::2,'pnl_fee_v2'].cumsum()
pnl_list_cumsum = df_pnl.loc[1::2,'pnl_v2'].cumsum()
volume_cumsum = (df_pnl.trade_price * df_pnl.trade_quantity).cumsum()
return pd.DataFrame({'pnl' : pnl_list,'pnl_cumsum':pnl_list_cumsum,'pnlfee_cumsum':pnl_fee_cumsum})
def plot_step_3(plot_type,df_bid_ask,y_pred,T=0.1,S=1,M=1000,C=0):
df = step_3(df_bid_ask,y_pred,T,S,M,C)
plt.figure(figsize=(10,5))
if plot_type == 1:
title_name = 'PnL cumsum'
col = 'pnl_cumsum'
elif plot_type == 2:
title_name = 'PnL_fee cumsum'
col ='pnlfee_cumsum'
else:
title_name = 'PnL hist'
col = 'pnl'
plt.title(title_name)
if plot_type != 3:
max_ = round(df[col].max(),3)
plt.plot(np.arange(df[col].shape[0]),df[col],color='black', marker='.',label = f'pnl,M={M},C={round(C,5)}')
plt.axhline(y = max_, color ="red", linestyle ="--",label = f'max PnL = {max_}')
plt.legend()
plt.grid()
plt.ylabel('Number of trade')
plt.xlabel('PnL')
plt.show()
else :
plt.hist(df[col],color='red',edgecolor='black',label = f'pnl,M={M},C={round(C,5)}',alpha=0.5,bins=12)
plt.legend()
plt.grid()
plt.xlabel('PnL')
plt.show()
plot_stats(train_stats_list,test_stats_list,['total_pnl','mean_pnl','sharp_ratio','positive_trade_ratio','trade_num',\
'max_drop_down'],M=10000,C=0)
step_2_v4(df_tr[['ask_price','bid_price']],df_tr.pred,T=0.09,M=10000,C=0)
{'total_pnl': 20.469,
'mean_pnl': 0.14117,
'std_pnl': 0.21347,
'sharp_ratio': 0.6613,
'volume': 10402.565,
'trade_num': 145.0,
'positive_trade_ratio': 0.77931,
'max_drop_down': 0.497}
step_2_v4(df_tt[['ask_price','bid_price']],df_tt.pred,T=0.09,M=10000,C=0.07/100)
{'total_pnl': 0.80021,
'mean_pnl': 0.02425,
'std_pnl': 0.27354,
'sharp_ratio': 0.08865,
'volume': 2301.131,
'trade_num': 33.0,
'positive_trade_ratio': 0.45455,
'max_drop_down': 1.21068}
step_2_v4(df_tt[['ask_price','bid_price']],df_tt.pred,T=0.09,M=10000,C=0)
{'total_pnl': 2.411,
'mean_pnl': 0.07306,
'std_pnl': 0.2723,
'sharp_ratio': 0.26831,
'volume': 2301.131,
'trade_num': 33.0,
'positive_trade_ratio': 0.54545,
'max_drop_down': 0.785}
plot_stats(train_stats_list,test_stats_list,['total_pnl','mean_pnl','sharp_ratio','positive_trade_ratio','trade_num'],M=M_list[0],C=0)
plot_stats(train_stats_list,test_stats_list,['total_pnl','mean_pnl','sharp_ratio','positive_trade_ratio','trade_num'],M=M_list[1],C=0)
plot_stats(train_stats_list,test_stats_list,['total_pnl','mean_pnl','sharp_ratio','positive_trade_ratio','trade_num'],M=M_list[2],C=0)
plot_stats(train_stats_list,test_stats_list,['total_pnl','mean_pnl','sharp_ratio','positive_trade_ratio','trade_num'],M=M_list[3],C=0)
plot_stats(train_stats_list,test_stats_list,['total_pnl','mean_pnl','sharp_ratio','positive_trade_ratio','trade_num'],M=M_list[4],C=0)
plot_stats(train_stats_list,test_stats_list,['total_pnl','mean_pnl','sharp_ratio','positive_trade_ratio','trade_num'],M=10000,C=0)
for i in range(3):
plot_step_3(i + 1,df_tr[['ask_price','bid_price']],df_tr.pred,T=0.09,M=10000,C=0.)
for i in range(3):
plot_step_3(i + 1,df_tt[['ask_price','bid_price']],df_tt.pred,T=0.09,M=10000,C=0.)
step_2_v4(df_ot[['ask_price','bid_price']],df_ot.pred,T=0.09,M=10000,C=0)
{'total_pnl': 2.829,
'mean_pnl': 0.02482,
'std_pnl': 0.21261,
'sharp_ratio': 0.11672,
'volume': 6995.269,
'trade_num': 114.0,
'positive_trade_ratio': 0.55263,
'max_drop_down': 1.647}
for i in range(3):
plot_step_3(i + 1,df_ot[['ask_price','bid_price']],df_ot.pred,T=0.09,M=10000,C=0.)
plot_stats(train_stats_list,test_stats_list,['total_pnl','mean_pnl','sharp_ratio','positive_trade_ratio','trade_num',\
'max_drop_down'],M=10000,C=0.07/100)
step_2_v4(df_tr[['ask_price','bid_price']],df_tr.pred,T=0.09,M=10000,C=0.07/100)
{'total_pnl': 13.1872,
'mean_pnl': 0.09095,
'std_pnl': 0.21433,
'sharp_ratio': 0.42434,
'volume': 10402.565,
'trade_num': 145.0,
'positive_trade_ratio': 0.63448,
'max_drop_down': 0.71376}
step_2_v4(df_tt[['ask_price','bid_price']],df_tt.pred,T=0.09,M=10000,C=0.07/100)
{'total_pnl': 0.80021,
'mean_pnl': 0.02425,
'std_pnl': 0.27354,
'sharp_ratio': 0.08865,
'volume': 2301.131,
'trade_num': 33.0,
'positive_trade_ratio': 0.45455,
'max_drop_down': 1.21068}
for i in range(3):
plot_step_3(i + 1,df_tr[['ask_price','bid_price']],df_tr.pred,T=0.09,M=10000,C=0.07/100)
for i in range(3):
plot_step_3(i + 1,df_tt[['ask_price','bid_price']],df_tt.pred,T=0.09,M=10000,C=0.07/100)